library(plotly)
library(data.table)
library(tidyr)
library(knitr)
library(heatmaply)

Preprocessing

  • Load data file
  • Rename genres for better readability
    • “Religion, Spirituality & New Age” to “Religion”
    • “Science.fiction” to “SciFi”
    • “Action.and.Adventure” to “Action”

All genres:

 [1] "Satire"        "SciFi"         "Drama"         "Action"        "Romance"       "Mystery"       "Horror"       
 [8] "Self.help"     "Health"        "Guide"         "Travel"        "Children.s"    "Religion"      "Science"      
[15] "History"       "Math"          "Anthology"     "Poetry"        "Encyclopedias" "Dictionaries"  "Comics"       
[22] "Art"           "Cookbooks"     "Diaries"       "Journals"     
  • Check whether the upper and lower triangles of the matrix are identical
[1] TRUE
  • Transform to long and tidy data.table
head(books_dt)
  • Average number of genres per customer
[1] 2.332187

First ideas

Show me everything!

  • Romance, SciFi, Action, and History are the most-bought genres
  • bought-together clusters:
    • Romance, SciFi, Action, History
    • Dictionaries and Comics
    • Math and Poetry
  • Mystery is an outlier

Most bought genre

Best pairs

  • The top pairs are mostly combinations of the most-bought genres

Special genres

Hypothesis

  • If a customer buys more than 2 genres, they are recorded in more than 1 off-diagonal entry of a genre's column; when such customers dominate:
    • (2*diagonal - colSum) < 0
  • If a genre is bought more often alone than in triplets (or larger combinations):
    • (2*diagonal - colSum) > 0

Look for customers who buy only one genre

  • Compare the column sum with the 2*diagonal value
  • Generate a table with {genre, 2*diagonal - colSum}

Normalize columns by value of diagonal

With clustering of rows and columns

Relative best pairs

LS0tCnRpdGxlOiAiQWxsaWFueiBEYXRhVml6IENoYWxsZW5nZSIKYXV0aG9yOiAiRGFuaWVsIEJhZGVyIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB5ZXMKLS0tCgpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgZWNobz1UfQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShkYXRhLnRhYmxlKQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGtuaXRyKQpsaWJyYXJ5KGhlYXRtYXBseSkKYGBgCgoKYGBge3IsIGVjaG89RkFMU0V9Cm9wdHNfY2h1bmskc2V0KGVjaG89RkFMU0UsIGNhY2hlPUYpCnRvdGFsX2N1c3RvbWVycyA8LSAxOTUzODcKZmlsZV9ib29rc3RvcmUgPC0gZmlsZS5wYXRoKCJ+L0Rvd25sb2Fkcy90b3lkYXRhL2Jvb2tfZ2VucmVzX2RhdGEuY3N2IikKc291cmNlKCJidWlsZF9ib29rX3N0b3JlLlIiKQpgYGAKCgojIFByZXByb2Nlc3NpbmcKCiogTG9hZCBkYXRhIGZpbGUKKiByZW5hbWUgZ2VucmVzIGZvciBiZXR0ZXIgcmVhZGFiaWxpdHkKICAgICogIlJlbGlnaW9uLCBTcGlyaXR1YWxpdHkgJiBOZXcgQWdlIiB0byAiUmVsaWdpb24iCiAgICAqICJTY2llbmNlLmZpY3Rpb24iIHRvICJTY2lGaSIKICAgICogIkFjdGlvbi5hbmQuQWR2ZW50dXJlIiB0byAiQWN0aW9uIgogICAgCkFsbCBnZW5yZXM6CmBgYHtyfQpib29rc19tYXQgPC0gcmVhZC5jc3YoZmlsZV9ib29rc3RvcmUsIHJvdy5uYW1lcyA9IDEpCnJvd25hbWVzKGJvb2tzX21hdCkgPC0gbWFrZS5uYW1lcyhyb3duYW1lcyhib29rc19tYXQpKQpyb3duYW1lcyhib29rc19tYXQpIDwtIHN1YigiU2NpZW5jZS5maWN0aW9uIiwgIlNjaUZpIiwgcm93bmFtZXMoYm9va3NfbWF0KSkKcm93bmFtZXMoYm9va3NfbWF0KSA8LSBzdWIoIkFjdGlvbi5hbmQuQWR2ZW50dXJlIiwgIkFjdGlvbiIsIHJvd25hbWVzKGJvb2tzX21hdCkpCnJvd25hbWVzKGJvb2tzX21hdCkgPC0gc3ViKCJSZWxpZ2lvbi4uU3Bpcml0dWFsaXR5Li4uTmV3LkFnZSIsIAogICAgIlJlbGlnaW9uIiwgcm93bmFtZXMoYm9va3NfbWF0KQopCmNvbG5hbWVzKGJvb2tzX21hdCkgPC0gcm93bmFtZXMoYm9va3NfbWF0KQpyb3duYW1lcyhib29rc19tYXQpCmBgYAoKKiBDaGVjayBpZiB1cHBlciBhbmQgbG93ZXIgdHJpYW5nbGUgaWRlbnRpY2FsCgpgYGB7cn0KaXNfdXBwZXJfbG93ZXIgPC0gaWRlbnRpY2FsKAogICAgYm9va3NfbWF0W3VwcGVyLnRyaShib29rc19tYXQpXSwgCiAgICB0KGJvb2tzX21hdClbdXBwZXIudHJpKGJvb2tzX21hdCldCikKaXNfdXBwZXJfbG93ZXIKYGBgCgoqIFRyYW5zZm9ybSB0byBsb25nIGFuZCB0aWR5IGBkYXRhLnRhYmxlYAoKYGBge3J9CmJvb2tzX2R0IDwtIGFzLmRhdGEudGFibGUoYm9va3NfbWF0LCBrZWVwLnJvd25hbWVzID0gVFJVRSkKc2V0bmFtZXMoYm9va3NfZHQsIGMoJ2dlbnJlQScsY29sbmFtZXMoYm9va3NfbWF0KSkpCmJvb2tzX2R0IDwt
IGFzLmRhdGEudGFibGUoZ2F0aGVyKGJvb2tzX2R0LCBnZW5yZUIsIGN1c3RvbWVycywgU2F0aXJlOkpvdXJuYWxzKSkKYGBgCgpgYGB7ciwgZWNobz1UfQpoZWFkKGJvb2tzX2R0KQpgYGAKCgoqIEF2ZXJhZ2UgbnVtYmVyIG9mIGdlbnJlcyBwZXIgY3VzdG9tZXIKCmBgYHtyfQpzdW0oYm9va3NfZHRbZ2VucmVBPT1nZW5yZUIsIGN1c3RvbWVyc10pL3RvdGFsX2N1c3RvbWVycwpgYGAKCgojIEZpcnN0IGlkZWFzCgojIyBTaG93IG1lIGV2ZXJ5dGhpbmchCgpgYGB7ciwgZmlnLndpZHRoPTgsIGZpZy5oZWlnaHQ9OH0KaG0gPC0gaGVhdG1hcHIoYm9va3NfbWF0KQpoZWF0bWFwbHkoaG0sIAogICAgcGxvdF9tZXRob2QgPSAncGxvdGx5JywgCiAgICBjb2xvcnMgPSAgYygnZ3JleTk1JywgJ2RvZGdlcmJsdWUnKQopCmBgYAoKKiBSb21hbmNlLCBTY2lGaSwgQWN0aW9uLCBIaXN0b3J5IGFyZSBtb3N0IGJvdWdodCAKKiBib3VnaHQtdG9nZXRoZXIgY2x1c3RlcnM6CiAgICAqIFJvbWFuY2UsIFNjaUZpLCBBY3Rpb24sIEhpc3RvcnkKICAgICogRGljdGlvbmFyaWVzIGFuZCBDb21pY3MKICAgICogTWF0aCBhbmQgUG9ldHJ5CiogTXlzdGVyeSBpcyBhbiBvdXRsaWVyCgojIyBNb3N0IGJvdWdodCBnZW5yZQoKYGBge3J9CnBsb3RfbHkoZGF0YT1ib29rc19kdFtnZW5yZUE9PWdlbnJlQl1bb3JkZXIoY3VzdG9tZXJzKV0sIAogICAgeD1+Z2VucmVBLCB5PX5jdXN0b21lcnMsIHR5cGU9ImJhciIKKSU+JSBsYXlvdXQoCiAgICBtYXJnaW49bGlzdChiPTEwMCksIAogICAgeGF4aXM9bGlzdChjYXRlZ29yeW9yZGVyPSJ0cmFjZSIpLAogICAgdGl0bGU9Ik1vc3QgYm91Z2h0IGdlbnJlIgopCmBgYAoKIyMgQmVzdCBwYWlycwoKYGBge3J9CmFsbF9nZW5yZXMgPC0gdW5pcXVlKGJvb2tzX2R0JGdlbnJlQSkKYWxsX3BhaXJzIDwtIGNvbWJuKGFsbF9nZW5yZXMsIDIsIHNpbXBsaWZ5ID0gRikKcGFpcl9jdXN0b21lcnMgPC0gCnBhaXJfZHQgPC0gZGF0YS50YWJsZSgKICAgIGdlbnJlX3BhaXJzID0gc2FwcGx5KGFsbF9wYWlycywgZnVuY3Rpb24ocCl7CiAgICAgICAgcGFzdGUoc29ydChwKSwgY29sbGFwc2UgPSAiJiIpfQogICAgKSwKICAgIHBhaXJfY3VzdG9tZXJzID0gc2FwcGx5KGFsbF9wYWlycywgZnVuY3Rpb24ocCl7CiAgICAgICAgYm9va3NfZHRbZ2VucmVBPT1wWzFdICYgZ2VucmVCPT1wWzJdLCBjdXN0b21lcnNdCiAgICB9KQopCnBsb3RfbHkoZGF0YT1wYWlyX2R0W29yZGVyKHBhaXJfY3VzdG9tZXJzLCBkZWNyZWFzaW5nPVQpXVsxOjEwXSwgdHlwZT0nYmFyJywgCiAgICB4PX5nZW5yZV9wYWlycywgeT1+cGFpcl9jdXN0b21lcnMKKSU+JSBsYXlvdXQoCiAgICBtYXJnaW49bGlzdChiPTEwMCksIAogICAgeGF4aXM9bGlzdChjYXRlZ29yeW9yZGVyPSJ0cmFjZSIpLAogICAgdGl0bGU9IlRvcCAxMCBnZW5yZSBwYWlycyIKKQpgYGAKCiogbW9zdGx5IGNvbWJpbmF0aW9ucyBvZiBtb3N0IGJvdWdodCBnZW5yZXMKCgojIFNwZWNpYWwgZ2Vu
cmVzCgpIeXBvdGhlc2lzCgoqIElmIGEgY3VzdG9tZXIgYnV5cyBtb3JlIHRoYW4gMiBnZW5yZXMsIApoZSBpcyByZWNvcmRlZCBpbiBtb3JlIHRoYW4gMSBvZmYtZGlhZ29uYWwgZW50cnk6CiAgICAqICgyKmRpYWdvbmFsIC0gY29sU3VtKSA8IDAKKiBJZiBhIGdlbnJlIGlzIGJvdWdodCBtb3JlIG9mdGVuIGFsb25lIHRoYW4gaW4gdHJpcGxldHMgKG9yIGhpZ2hlcik6IAogICAgKiAoMipkaWFnb25hbCAtIGNvbFN1bSkgPiAwCgoKTG9vayBmb3IgY3VzdG9tZXJzIHRoYXQgYnV5IG9ubHkgb25lIGdlbnJlCgoqIENvbXBhcmUgYGNvbHVtbiBzdW1gIGFuZCAgYDIqZGlhZ29uYWwgdmFsdWVgCiogZ2VuZXJhdGUgdGFibGUgd2l0aCBge2dlbnJlLCB7MipkaWFnb25hbC1jb2xTdW19fWAKCgpgYGB7ciwgd2FybmluZz1GQUxTRSwgZmlnLndpZHRoPTh9CmFsbF9nZW5yZXMgPC0gdW5pcXVlKGJvb2tzX2R0JGdlbnJlQSkKc2VsZWN0aXZlX2R0IDwtIGRhdGEudGFibGUoKQpmb3IoZyBpbiBhbGxfZ2VucmVzKXsKICAgIGQgPC0gYm9va3NfZHRbZ2VucmVBPT1nICYgZ2VucmVCPT1nLCBjdXN0b21lcnNdCiAgICBjcyA8LSBzdW0oYm9va3NfZHRbZ2VucmVBPT1nLCBjdXN0b21lcnNdKQogICAgZGQgPC0gSSgyKmQgLSBjcykKICAgIHNlbGVjdGl2ZV9kdCA8LSByYmluZChzZWxlY3RpdmVfZHQsIGRhdGEudGFibGUoZ2VucmU9ZywgZGlhZ19kaWZmPWRkKSkKfQoKcF9zZWwgPC0gcGxvdF9seShkYXRhPXNlbGVjdGl2ZV9kdFtvcmRlcihkaWFnX2RpZmYpXSwgCiAgICB5PX5nZW5yZSwgeD1+ZGlhZ19kaWZmLCB0eXBlPSJiYXIiLCBjb2xvciA9IH5kaWFnX2RpZmY+MAopJT4lIGxheW91dCgKICAgIG1hcmdpbj1saXN0KGw9MTAwKSwgCiAgICB5YXhpcz1saXN0KGNhdGVnb3J5b3JkZXI9InRyYWNlIiwgdGl0bGU9JycpLAogICAgeGF4aXM9bGlzdCh0aXRsZT0nMipkaWFnb25hbCAtIGNvbHVtblN1bScpLAogICAgdGl0bGU9IldoaWNoIGdlbnJlcyBhcmUgYm91Z2h0IGFsb25lPyIKKQoKc2hvdyhwX3NlbCkKYGBgCgoKIyBOb3JtYWxpemUgY29sdW1ucyBieSB2YWx1ZSBvZiBkaWFnb25hbAoKYGBge3J9CmJvb2tzX2R0WywKICAgIHJlbF9jdXN0b21lcnM6PSAoY3VzdG9tZXJzL2Jvb2tzX2R0W2dlbnJlQT09Z2VucmVCLCBjdXN0b21lcnNdKSwgCiAgICBieT1nZW5yZUIKICAgIF0KaGVhZChib29rc19kdFtvcmRlcihnZW5yZUEpXSkKYGBgCmBgYHtyLCBmaWcud2lkdGg9OCwgZmlnLmhlaWdodD04fQpwbG90X2x5KGRhdGE9Ym9va3NfZHQpICU+JQogICAgYWRkX2hlYXRtYXAoCiAgICAgICAgej1+cmVsX2N1c3RvbWVycywgeD1+Z2VucmVBLCB5PX5nZW5yZUIsIGNvbG9ycz0gYygnZ3JleTk1JywgJ2RvZGdlcmJsdWUnKQogICAgKSAlPiUKICAgIGxheW91dCgKICAgICAgICBtYXJnaW49bGlzdChiPTExMCwgbD0xMTApCiAgICApCmBgYAoKV2l0aCBjbHVzdGVyaW5nIG9mIHJvd3MgYW5kIGNvbHVtbnMKYGBge3IsIGZpZy53aWR0aD04LCBmaWcu
aGVpZ2h0PTh9CmJvb2tzX3JlbG1hdCA8LSBkY2FzdChib29rc19kdCwgZ2VucmVBIH4gZ2VucmVCLCB2YWx1ZS52YXIgPSAicmVsX2N1c3RvbWVycyIpCmJvb2tzX3JlbG1hdCA8LSBhcy5tYXRyaXgoYm9va3NfcmVsbWF0WyxnZW5yZUE6PU5VTExdKQpyb3duYW1lcyhib29rc19yZWxtYXQpIDwtIGNvbG5hbWVzKGJvb2tzX3JlbG1hdCkKCmhlYXRtYXBseShoZWF0bWFwcihib29rc19yZWxtYXQpLCAKICAgIHBsb3RfbWV0aG9kID0gJ3Bsb3RseScsIAogICAgY29sb3JzID0gIGMoJ2dyZXk5NScsICdkb2RnZXJibHVlJykKKQpgYGAKClJlbGF0aXZlIGJlc3QgcGFpcnMKYGBge3J9CnBsb3RfbHkoCiAgICAgICAgZGF0YSA9IGJvb2tzX2R0W2dlbnJlQSAhPSBnZW5yZUJdW29yZGVyKHJlbF9jdXN0b21lcnMsIGRlY3JlYXNpbmcgPSBUKV1bMToxMF0KICAgICkgJT4lIAogICAgYWRkX2JhcnMoCiAgICAgICAgeD1+cGFzdGUwKGdlbnJlQSwgIiYiLCBnZW5yZUIpLCB5PX5yZWxfY3VzdG9tZXJzCiAgICAgICAgKSAlPiUgCiAgICBsYXlvdXQoCiAgICAgICAgbWFyZ2luPWxpc3QoYj0xMDAsIHI9ODApLCAKICAgICAgICB4YXhpcz1saXN0KGNhdGVnb3J5b3JkZXI9InRyYWNlIiwgdGl0bGU9JycpLAogICAgICAgIHlheGlzPWxpc3QoZXhwb25lbnRmb3JtYXQ9J25vbmUnKSwKICAgICAgICB0aXRsZT0iVG9wIDEwIHJlbGF0aXZlIGdlbnJlIHBhaXJzIgopCmBgYAoKCg==